#———————————————————————————————————————————– #IMPORTING NECESSARY PACKAGES.

library(data.table) # used for reading and manipulation of data
library(dplyr)      # used for data manipulation and joining
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)    # used for ploting 
library(caret)      # used for modeling
## Loading required package: lattice
library(corrplot)   # used for making correlation plot
## corrplot 0.84 loaded
library(xgboost)    # used for building XGBoost model
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(cowplot)    # used for combining multiple plots
library(caretEnsemble)
## 
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
## 
##     autoplot
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

#———————————————————————————————————————————–

MODULE 1: DATA CLEANING AND PRE-PROCESSING.

#CODE TO BE FILLED
#Transforming "low fat" and "LF" to "Low Fat"

train_v9rqX0R_1_ <- read.csv("train_v9rqX0R (1).csv")

big_mart <- train_v9rqX0R_1_
index <- which(big_mart$Item_Fat_Content == "LF" | 
                 big_mart$Item_Fat_Content == "low fat")

big_mart[index, "Item_Fat_Content"] <- "Low Fat"


#Transforming "reg" to "Regular
index2 <- which(big_mart$Item_Fat_Content == "reg")

big_mart[index2, "Item_Fat_Content"] <- "Regular"

#Dropping Unused Levels
#Using kNN imputation for missing values
big_mart_imputed <- kNN(big_mart)
big_mart_imputed <- big_mart_imputed %>% 
  select(Item_Identifier:Item_Outlet_Sales)
#write.csv()
summary(big_mart_imputed)
##  Item_Identifier     Item_Weight     Item_Fat_Content   Item_Visibility  
##  Length:8523        Min.   : 4.555   Length:8523        Min.   :0.00000  
##  Class :character   1st Qu.: 9.000   Class :character   1st Qu.:0.02699  
##  Mode  :character   Median :12.600   Mode  :character   Median :0.05393  
##                     Mean   :12.838                      Mean   :0.06613  
##                     3rd Qu.:16.500                      3rd Qu.:0.09459  
##                     Max.   :21.350                      Max.   :0.32839  
##   Item_Type            Item_MRP      Outlet_Identifier 
##  Length:8523        Min.   : 31.29   Length:8523       
##  Class :character   1st Qu.: 93.83   Class :character  
##  Mode  :character   Median :143.01   Mode  :character  
##                     Mean   :140.99                     
##                     3rd Qu.:185.64                     
##                     Max.   :266.89                     
##  Outlet_Establishment_Year Outlet_Size        Outlet_Location_Type
##  Min.   :1985              Length:8523        Length:8523         
##  1st Qu.:1987              Class :character   Class :character    
##  Median :1999              Mode  :character   Mode  :character    
##  Mean   :1998                                                     
##  3rd Qu.:2004                                                     
##  Max.   :2009                                                     
##  Outlet_Type        Item_Outlet_Sales 
##  Length:8523        Min.   :   33.29  
##  Class :character   1st Qu.:  834.25  
##  Mode  :character   Median : 1794.33  
##                     Mean   : 2181.29  
##                     3rd Qu.: 3101.30  
##                     Max.   :13086.97
table(big_mart_imputed$Outlet_Identifier, big_mart_imputed$Outlet_Size)
##         
##              High Medium Small
##   OUT010 555    0      0     0
##   OUT013   0  932      0     0
##   OUT017 926    0      0     0
##   OUT018   0    0    928     0
##   OUT019   0    0      0   528
##   OUT027   0    0    935     0
##   OUT035   0    0      0   930
##   OUT045 929    0      0     0
##   OUT046   0    0      0   930
##   OUT049   0    0    930     0
table(big_mart_imputed$Outlet_Identifier, big_mart_imputed$Outlet_Type)
##         
##          Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
##   OUT010           555                 0                 0                 0
##   OUT013             0               932                 0                 0
##   OUT017             0               926                 0                 0
##   OUT018             0                 0               928                 0
##   OUT019           528                 0                 0                 0
##   OUT027             0                 0                 0               935
##   OUT035             0               930                 0                 0
##   OUT045             0               929                 0                 0
##   OUT046             0               930                 0                 0
##   OUT049             0               930                 0                 0
table(big_mart$Outlet_Type, big_mart_imputed$Outlet_Size)
##                    
##                          High Medium Small
##   Grocery Store      555    0      0   528
##   Supermarket Type1 1855  932    930  1860
##   Supermarket Type2    0    0    928     0
##   Supermarket Type3    0    0    935     0
index3 <- which(big_mart_imputed$Outlet_Identifier == "OUT010")
big_mart_imputed[index3, "Outlet_Size"] <- "Small"
index4 <- which(big_mart_imputed$Outlet_Identifier == "OUT017")
big_mart_imputed[index4, "Outlet_Size"] <- "Small"
index5 <- which(big_mart_imputed$Outlet_Identifier == "OUT045")
big_mart_imputed[index5, "Outlet_Size"] <- "Small"
big_mart_imputed$Outlet_Size <- factor(big_mart_imputed$Outlet_Size)
summary(big_mart_imputed)
##  Item_Identifier     Item_Weight     Item_Fat_Content   Item_Visibility  
##  Length:8523        Min.   : 4.555   Length:8523        Min.   :0.00000  
##  Class :character   1st Qu.: 9.000   Class :character   1st Qu.:0.02699  
##  Mode  :character   Median :12.600   Mode  :character   Median :0.05393  
##                     Mean   :12.838                      Mean   :0.06613  
##                     3rd Qu.:16.500                      3rd Qu.:0.09459  
##                     Max.   :21.350                      Max.   :0.32839  
##   Item_Type            Item_MRP      Outlet_Identifier 
##  Length:8523        Min.   : 31.29   Length:8523       
##  Class :character   1st Qu.: 93.83   Class :character  
##  Mode  :character   Median :143.01   Mode  :character  
##                     Mean   :140.99                     
##                     3rd Qu.:185.64                     
##                     Max.   :266.89                     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985              High  : 932   Length:8523         
##  1st Qu.:1987              Medium:2793   Class :character    
##  Median :1999              Small :4798   Mode  :character    
##  Mean   :1998                                                
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##  Outlet_Type        Item_Outlet_Sales 
##  Length:8523        Min.   :   33.29  
##  Class :character   1st Qu.:  834.25  
##  Mode  :character   Median : 1794.33  
##                     Mean   : 2181.29  
##                     3rd Qu.: 3101.30  
##                     Max.   :13086.97
#write.csv(big_mart_imputed,"C:/Users/jayanth/Desktop/Data analytics lab/sales_cleaned.csv", row.names = FALSE)


sales_cleaned <- read.csv("sales_cleaned.csv")

#———————————————————————————————————————————–

MODULE 2: DATA EXPLORATION - ANSWERING QUESTIONS ABOUT THE DATASET.

#———————————————————————————————————————————–

#1. What are the types of outlets, outlet locations, and outlet sizes in the given dataset and what is their count?

table(sales_cleaned$Outlet_Location_Type)
## 
## Tier 1 Tier 2 Tier 3 
##   2388   2785   3350
table(sales_cleaned$Outlet_Type)
## 
##     Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3 
##              1083              5577               928               935
table(sales_cleaned$Outlet_Size)
## 
##   High Medium  Small 
##    932   3722   3869

INFERENCE:

A. 3 TYPES OF OUTLETS LOCATIONS - TIER 1,2,3

B. 4 TYPES OF OUTLET TYPES :

i) GROCERY STORE

ii) SUPERMARKET TYPE 1

iv) SUPERMARKET TYPE 3

C. 3 TYPES OF OUTLET SIZES : SMALL, MEDIUM AND HIGH.

#Q2. WHAT IS THE RELATIONSHIP BETWEEN OUTLET TYPE AND OUTLET SIZE?

table(sales_cleaned$Outlet_Type , sales_cleaned$Outlet_Size)
##                    
##                     High Medium Small
##   Grocery Store        0      0  1083
##   Supermarket Type1  932   1859  2786
##   Supermarket Type2    0    928     0
##   Supermarket Type3    0    935     0

#Q3.COUNT OF DIFFERENT ITEM TYPES IN THE VARIOUS OUTLET TYPES.

table(sales_cleaned$Item_Type , sales_cleaned$Outlet_Type)
##                        
##                         Grocery Store Supermarket Type1 Supermarket Type2
##   Baking Goods                     85               426                68
##   Breads                           33               160                27
##   Breakfast                        19                68                12
##   Canned                           73               426                78
##   Dairy                            92               450                73
##   Frozen Foods                    103               572                92
##   Fruits and Vegetables           152               805               135
##   Hard Drinks                      24               145                22
##   Health and Hygiene               67               335                58
##   Household                       119               597                95
##   Meat                             66               257                46
##   Others                           27               107                20
##   Seafood                          10                40                 7
##   Snack Foods                     146               785               132
##   Soft Drinks                      54               300                46
##   Starchy Foods                    13               104                17
##                        
##                         Supermarket Type3
##   Baking Goods                         69
##   Breads                               31
##   Breakfast                            11
##   Canned                               72
##   Dairy                                67
##   Frozen Foods                         89
##   Fruits and Vegetables               140
##   Hard Drinks                          23
##   Health and Hygiene                   60
##   Household                            99
##   Meat                                 56
##   Others                               15
##   Seafood                               7
##   Snack Foods                         137
##   Soft Drinks                          45
##   Starchy Foods                        14
data1 <- sales_cleaned

#Q4. THE COUNT OF THE OUTLET SIZES IN EACH OUTLET LOCATION.

table(data1$Outlet_Location_Type , data1$Outlet_Size)
##         
##          High Medium Small
##   Tier 1    0    930  1458
##   Tier 2    0    929  1856
##   Tier 3  932   1863   555

#Q5. ITEM TYPE AND IT’S AVERAGE MRP

df3 <- data1 %>% group_by(Item_Type )  %>% summarise(mean_Item_MRP = mean(Item_MRP, na.rm=TRUE))
df3

#Q6. AVERAGE OUTLET SALES OF EACH OUTLET LOCATION TYPE.

df4 <- data1 %>% group_by(Outlet_Location_Type )  %>% summarise(mean_outlet_sales = mean(Item_Outlet_Sales, na.rm=TRUE)) %>% arrange(desc(mean_outlet_sales))
df4

#———————————————————————————————————————————-

#Q7. WHAT IS THE RELATION BETWEEN THE SALES AMOUNT AND THE TYPE OF OUTLET?

sales_cleaned %>% group_by(Outlet_Type) %>% summarise(Mean_sales_per_outlet= mean(Item_Outlet_Sales)) %>% arrange(desc(Mean_sales_per_outlet))

#Q8. HOW ARE OUTLET LOCATION AND OUTLET TYPE RELATED?

#sales_cleaned %>% select(Outlet_Location_Type,Outlet_Type) %>% group_by(Outlet_Location_Type) %>% summarise()
table(sales_cleaned$Outlet_Location_Type, sales_cleaned$Outlet_Type)
##         
##          Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
##   Tier 1           528              1860                 0                 0
##   Tier 2             0              2785                 0                 0
##   Tier 3           555               932               928               935

#Q9. RELATIONSHIP BETWEEN YEAR OF ESTABLSIHMENT OF AN OUTLET AND THE OUTLET TYPE.

table(sales_cleaned$Outlet_Establishment_Year, sales_cleaned$Outlet_Type)
##       
##        Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
##   1985           528                 0                 0               935
##   1987             0               932                 0                 0
##   1997             0               930                 0                 0
##   1998           555                 0                 0                 0
##   1999             0               930                 0                 0
##   2002             0               929                 0                 0
##   2004             0               930                 0                 0
##   2007             0               926                 0                 0
##   2009             0                 0               928                 0

#Q10. RELATIONSHIP BETWEEN THE MRP AND THE ITEM VISIBILTIY.

res <- cor.test(sales_cleaned$Item_MRP, sales_cleaned$Item_Visibility, 
                    method = "pearson")
res
## 
##  Pearson's product-moment correlation
## 
## data:  sales_cleaned$Item_MRP and sales_cleaned$Item_Visibility
## t = -0.12137, df = 8521, p-value = 0.9034
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.02254486  0.01991635
## sample estimates:
##          cor 
## -0.001314848

INFERENCE: P VALUE IS > 0.05, HENCE NO SIGNIFICANT CORRELATION.

#Q11. RELATIONSHIP BETWEEN THE ITEM TYPE AND VISIBILITY.

sales_cleaned %>% select(Item_Visibility, Item_Type) %>% group_by(Item_Type) %>% summarise(avg_visibility= mean(Item_Visibility)) %>% arrange(desc(avg_visibility))

#Q12. RELATIONSHIP BETWEEN SALES AND OUTLET SIZE?

sales_cleaned %>% group_by(Outlet_Size) %>% summarise(Mean_sales_per_outlet= mean(Item_Outlet_Sales)) %>% arrange(desc(Mean_sales_per_outlet))

#Q13. AVERAGE SALES BY OUTLET SIZE.

df6 <- data1 %>% group_by(Outlet_Size )  %>% summarise(mean_outlet_sales = mean(Item_Outlet_Sales, na.rm=TRUE)) %>% arrange(desc(mean_outlet_sales))
df6

#———————————————————————————————————————————-

MODULE 3. DATA VISUALIZATION.

#1. COUNT OF RETAIL CHAINS ESTABLISHED EVERY YEAR.

df1 <- sales_cleaned
x <- ggplot(data=sales_cleaned,aes(x=Outlet_Establishment_Year)) + geom_histogram(binwidth = 01, color='red',fill='blue') + xlab('YEAR OF ESTABLISHMENT') + ylab('Frequency') + ggtitle('HISTOGRAM PLOT OF YEAR OF ESTABLISHMENT')

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:xgboost':
## 
##     slice
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
ggplotly(x)
p0 <- ggplot(sales_cleaned) + geom_histogram(aes(sales_cleaned$Item_Outlet_Sales), binwidth = 100, fill = "darkgreen") +
  xlab("Item_Outlet_Sales")

p0
## Warning: Use of `sales_cleaned$Item_Outlet_Sales` is discouraged. Use
## `Item_Outlet_Sales` instead.

library(cowplot)
combi=sales_cleaned
p1 = ggplot(combi) + geom_histogram(aes(Item_Weight), binwidth = 0.5, fill = "blue")
p2 = ggplot(combi) + geom_histogram(aes(Item_Visibility), binwidth = 0.005, fill = "blue")
p3 = ggplot(combi) + geom_histogram(aes(Item_MRP), binwidth = 1, fill = "blue")
plot_grid(p1, p2, p3, nrow = 1) # plot_grid() from cowplot package

#Independent Variables(categorical variables)

ggplot(combi %>% group_by(Item_Fat_Content) %>% summarise(Count = n())) + 
  geom_bar(aes(Item_Fat_Content, Count), stat = "identity", fill = "coral1")

# plot for Item_Type
p4 = ggplot(combi %>% group_by(Item_Type) %>% summarise(Count = n())) + 
  geom_bar(aes(Item_Type, Count), stat = "identity", fill = "coral1") +
  xlab("") +
  geom_label(aes(Item_Type, Count, label = Count), vjust = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  ggtitle("Item_Type")

ggplotly(p4)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
p5 = ggplot(combi %>% group_by(Outlet_Identifier) %>% summarise(Count = n())) + 
  geom_bar(aes(Outlet_Identifier, Count), stat = "identity", fill = "coral1") +
  geom_label(aes(Outlet_Identifier, Count, label = Count), vjust = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplotly(p5)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
p6 = ggplot(combi %>% group_by(Outlet_Size) %>% summarise(Count = n())) + 
  geom_bar(aes(Outlet_Size, Count), stat = "identity", fill = "coral1") +
  geom_label(aes(Outlet_Size, Count, label = Count), vjust = 0.5) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

second_row = plot_grid(p5, p6, nrow = 1)
plot_grid(p4, second_row, ncol = 1)

ggplotly(p6)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
library(ggplot2)
p8 = ggplot(combi %>% group_by(Outlet_Type) %>% summarise(Count = n())) + 
  geom_bar(aes(Outlet_Type, Count), stat = "identity", fill = "coral1") +
  geom_label(aes(factor(Outlet_Type), Count, label = Count), vjust = 0.5) +
  theme(axis.text.x = element_text(size = 8.5))

library(plotly)
ggplotly(p8)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabel() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
# Item_Visibility vs Item_Outlet_Sales
p10 = ggplot(sales_cleaned) + 
  geom_point(aes(Item_Visibility, Item_Outlet_Sales), colour = "violet", alpha = 0.3) +
  theme(axis.title = element_text(size = 8.5))

p10

#item type vs outlet sales

#item type vs outlet sales
qplot(x = sqrt(Item_Outlet_Sales), data = data1, binwidth = 1,
      ylab = "Sales Distribution",
      xlab = "SQRT of Outlet Sales",
      fill=I("tomato")) +
  theme(axis.text.x = element_text(angle = 90),
        axis.text.y = element_text(angle = 30)) +
  theme_minimal() +
  scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
  scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
  facet_wrap(~Item_Type)
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 32 rows containing missing values (geom_bar).

qplot(x = Item_Type, y = sqrt(Item_Outlet_Sales),
      ylab = "SQRT of Outlet Sales",
      data = data1,
      geom = "boxplot",
      fill=I("tomato")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90),
        axis.text.y = element_text(angle = 30))

#outlet identifier vs outlet sales

qplot(x = sqrt(Item_Outlet_Sales), data = data1, binwidth = 1,
      ylab = "Sales Distribution",
      xlab = "SQRT of Outlet Sales",
      fill=I("tomato")) +
  theme(axis.text.x = element_text(angle = 90),
        axis.text.y = element_text(angle = 30)) +
  theme_minimal() +
  scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
  scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
  facet_wrap(~Outlet_Identifier,2)
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 23 rows containing missing values (geom_bar).

qplot(x = Outlet_Identifier, y = sqrt(Item_Outlet_Sales), 
            color = Outlet_Type,
            ylab = "SQRT of Outlet Sales",
            data = data1,
            geom = "boxplot",
            fill=Outlet_Type) +
  geom_vline(aes(xintercept=quantile(data1$Item_Outlet_Sales, c(.01))),  color="red", linetype="dashed", size=1) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90),
        axis.text.y = element_text(angle = 30))
## Warning: Use of `data1$Item_Outlet_Sales` is discouraged. Use
## `Item_Outlet_Sales` instead.

#outlet sales vs fat content

 qplot(x = sqrt(Item_Outlet_Sales), data = data1, binwidth = 1,
       ylab = "",
       xlab = "SQRT of Outlet Sales",
       fill=I("tomato")) +
   theme(axis.text.x = element_text(angle = 90),
         axis.text.y = element_text(angle = 30)) +
   scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
   scale_y_continuous(limits = c(0,200), breaks = seq(0,200,100)) +
   theme_minimal() +
   facet_wrap(~Item_Fat_Content)
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).

  qplot(x = Item_Fat_Content, y = sqrt(Item_Outlet_Sales),
       ylab = "SQRT of Outlet Sales",
       data = data1,
       geom = "boxplot",
       fill=I("tomato")) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 0),
         axis.text.y = element_text(angle = 30))

#outlet size vs outlet sales

  qplot(x = sqrt(Item_Outlet_Sales), data = sales_cleaned, binwidth = 0.5,
        ylab = "Sales Distribution",
        xlab = "SQRT of Outlet Sales",
        fill=I("tomato")) +
    theme(axis.text.x = element_text(angle = 90),
          axis.text.y = element_text(angle = 30)) +
    theme_minimal() +
    scale_x_continuous(limits = c(0,100), breaks = seq(0,120,15)) +
    scale_y_continuous(limits = c(0,40), breaks = seq(0,40,10)) +
    facet_wrap(~Outlet_Size)
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 41 rows containing missing values (geom_bar).

  qplot(x = Outlet_Size, y = sqrt(Item_Outlet_Sales),
        ylab = "SQRT of Outlet Sales",
        data = data1,
        geom = "boxplot",
        fill=I("tomato")) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90),
          axis.text.y = element_text(angle = 30))

getwd()
## [1] "/Users/roviros/Desktop/EDA THEORY"